<!DOCTYPE HTML>
<html lang="zh-Hans">
    <!-- Start book Python数据分析课程讲义 -->
    <head>
        <!-- head:start -->
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
        <title>jieba分词 | Python数据分析课程讲义</title>
        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
        <meta name="description" content="">
        <meta name="generator" content="GitBook 2.6.7">
        <meta name="author" content="BigCat">
        
        <meta name="HandheldFriendly" content="true"/>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="apple-mobile-web-app-capable" content="yes">
        <meta name="apple-mobile-web-app-status-bar-style" content="black">
        <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
        <link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">
        
    <link rel="stylesheet" href="../../gitbook/style.css">
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-tbfed-pagefooter/footer.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-splitter/splitter.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-highlight/website.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-fontsettings/website.css">
        
    
    

        
    
    
    <link rel="next" href="../../file/part06/6.3.html" />
    
    
    <link rel="prev" href="../../file/part06/6.1.html" />
    

        <!-- head:end -->
    </head>
    <body>
        <!-- body:start -->
        
    <div class="book"
        data-level="5.2"
        data-chapter-title="jieba分词"
        data-filepath="file/part06/6.2.md"
        data-basepath="../.."
        data-revision="Thu Apr 27 2017 00:50:19 GMT+0800 (CST)"
        data-innerlanguage="">
    

<div class="book-summary">
    <nav role="navigation">
        <ul class="summary">
            
            
            
            

            

            
    
        <li class="chapter " data-level="0" data-path="index.html">
            
                
                    <a href="../../index.html">
                
                        <i class="fa fa-check"></i>
                        
                        传智播客Python学院数据分析
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1" data-path="file/part01/1.html">
            
                
                    <a href="../../file/part01/1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.</b>
                        
                        一、工作环境准备及数据分析建模理论基础
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="1.1" data-path="file/part01/1.1.html">
            
                
                    <a href="../../file/part01/1.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.1.</b>
                        
                        Python 3.x新特性和编码回顾
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.2" data-path="file/part01/1.2.html">
            
                
                    <a href="../../file/part01/1.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.2.</b>
                        
                        DIKW模型与数据工程
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.3" data-path="file/part01/1.3.html">
            
                
                    <a href="../../file/part01/1.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.3.</b>
                        
                        数据分析建模理论基础
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="2" data-path="file/part02/2.html">
            
                
                    <a href="../../file/part02/2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.</b>
                        
                        二、科学计算工具NumPy
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="2.1" data-path="file/part02/2.1.html">
            
                
                    <a href="../../file/part02/2.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.1.</b>
                        
                        ndarray的创建与数据类型
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.2" data-path="file/part02/2.2.html">
            
                
                    <a href="../../file/part02/2.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.2.</b>
                        
                        ndarray的矩阵处理
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.3" data-path="file/part02/2.3.html">
            
                
                    <a href="../../file/part02/2.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.3.</b>
                        
                        ndarray的元素处理
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.4" data-path="file/part02/2.4.html">
            
                
                    <a href="../../file/part02/2.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.4.</b>
                        
                        实战案例：2016美国总统大选民意调查统计
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="3" data-path="file/part03/3.html">
            
                
                    <a href="../../file/part03/3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.</b>
                        
                        三、数据分析工具Pandas
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="3.1" data-path="file/part03/3.1.html">
            
                
                    <a href="../../file/part03/3.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.1.</b>
                        
                        Pandas的数据结构
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.2" data-path="file/part03/3.2.html">
            
                
                    <a href="../../file/part03/3.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.2.</b>
                        
                        Pandas的索引操作
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.3" data-path="file/part03/3.3.html">
            
                
                    <a href="../../file/part03/3.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.3.</b>
                        
                        Pandas的对齐运算
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.4" data-path="file/part03/3.4.html">
            
                
                    <a href="../../file/part03/3.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.4.</b>
                        
                        Pandas的函数应用
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.5" data-path="file/part03/3.5.html">
            
                
                    <a href="../../file/part03/3.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.5.</b>
                        
                        Pandas的层级索引
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.6" data-path="file/part03/3.6.html">
            
                
                    <a href="../../file/part03/3.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.6.</b>
                        
                        Pandas统计计算和描述
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.7" data-path="file/part03/3.7.html">
            
                
                    <a href="../../file/part03/3.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.7.</b>
                        
                        Pandas分组与聚合
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.8" data-path="file/part03/3.8.html">
            
                
                    <a href="../../file/part03/3.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.8.</b>
                        
                        数据清洗、合并、转化和重构
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.9" data-path="file/part03/3.9.html">
            
                
                    <a href="../../file/part03/3.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.9.</b>
                        
                        聚类模型 -- K-Means介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.10" data-path="file/part03/3.10.html">
            
                
                    <a href="../../file/part03/3.10.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.10.</b>
                        
                        实战案例：全球食品数据分析
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="4" data-path="file/part04/4.html">
            
                
                    <a href="../../file/part04/4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.</b>
                        
                        四、数据可视化工具
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="4.1" data-path="file/part04/4.1.html">
            
                
                    <a href="../../file/part04/4.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.1.</b>
                        
                        Matplotlib绘图
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.2" data-path="file/part04/4.2.html">
            
                
                    <a href="../../file/part04/4.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.2.</b>
                        
                        Seaborn绘图
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.3" data-path="file/part04/4.3.html">
            
                
                    <a href="../../file/part04/4.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.3.</b>
                        
                        Bokeh绘图
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.4" data-path="file/part04/4.4.html">
            
                
                    <a href="../../file/part04/4.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.4.</b>
                        
                        实战案例：世界高峰数据可视化
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="5" data-path="file/part06/6.html">
            
                
                    <a href="../../file/part06/6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.</b>
                        
                        五、自然语言处理NLTK
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="5.1" data-path="file/part06/6.1.html">
            
                
                    <a href="../../file/part06/6.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.1.</b>
                        
                        NLTK与自然语言处理基础
                    </a>
            
            
        </li>
    
        <li class="chapter active" data-level="5.2" data-path="file/part06/6.2.html">
            
                
                    <a href="../../file/part06/6.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.2.</b>
                        
                        jieba分词
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.3" data-path="file/part06/6.3.html">
            
                
                    <a href="../../file/part06/6.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.3.</b>
                        
                        情感分析
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.4" data-path="file/part06/6.4.html">
            
                
                    <a href="../../file/part06/6.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.4.</b>
                        
                        文本相似度和分类
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.5" data-path="file/part06/6.6.html">
            
                
                    <a href="../../file/part06/6.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.5.</b>
                        
                        实战案例：微博情感分析
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    


            
            <li class="divider"></li>
            <li>
                <a href="https://www.gitbook.com" target="_blank" rel="noopener" class="gitbook-link">
                    Published with GitBook
                </a>
            </li>
            
        </ul>
    </nav>
</div>

    <div class="book-body">
        <div class="body-inner">
            <div class="book-header" role="navigation">
    <!-- Actions Left -->
    

    <!-- Title -->
    <h1>
        <i class="fa fa-circle-o-notch fa-spin"></i>
        <a href="../../" >Python数据分析课程讲义</a>
    </h1>
</div>

            <div class="page-wrapper" tabindex="-1" role="main">
                <div class="page-inner">
                
                
                    <section class="normal" id="section-">
                    
                        <h1 id="jieba&#x5206;&#x8BCD;">jieba&#x5206;&#x8BCD;</h1>
<p>jieba&#x5206;&#x8BCD;&#x662F;Python&#x5199;&#x6210;&#x7684;&#x4E00;&#x4E2A;&#x7B97;&#x662F;&#x5DE5;&#x4E1A;&#x754C;&#x7684;&#x5206;&#x8BCD;&#x5F00;&#x6E90;&#x5E93;&#xFF0C;&#x5176;GitHub&#x5730;&#x5740;&#x4E3A;&#xFF1A;<a href="https://github.com/fxsjy/jieba" target="_blank" rel="noopener">https://github.com/fxsjy/jieba</a>&#xFF0C;&#x5728;Python&#x91CC;&#x7684;&#x5B89;&#x88C5;&#x65B9;&#x5F0F;&#xFF1A; <code>pip install jieba</code></p>
<p>&#x7B80;&#x5355;&#x793A;&#x4F8B;&#xFF1A;</p>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> jieba <span class="hljs-keyword">as</span> jb

seg_list = jb.cut(<span class="hljs-string">&quot;&#x6211;&#x6765;&#x5230;&#x5317;&#x4EAC;&#x6E05;&#x534E;&#x5927;&#x5B66;&quot;</span>, cut_all=<span class="hljs-keyword">True</span>)
print(<span class="hljs-string">&quot;&#x5168;&#x6A21;&#x5F0F;: &quot;</span> + <span class="hljs-string">&quot;/ &quot;</span>.join(seg_list))  <span class="hljs-comment"># &#x5168;&#x6A21;&#x5F0F;</span>

seg_list = jb.cut(<span class="hljs-string">&quot;&#x6211;&#x6765;&#x5230;&#x5317;&#x4EAC;&#x6E05;&#x534E;&#x5927;&#x5B66;&quot;</span>, cut_all=<span class="hljs-keyword">False</span>)
print(<span class="hljs-string">&quot;&#x7CBE;&#x786E;&#x6A21;&#x5F0F;: &quot;</span> + <span class="hljs-string">&quot;/ &quot;</span>.join(seg_list))  <span class="hljs-comment"># &#x7CBE;&#x786E;&#x6A21;&#x5F0F;</span>

seg_list = jb.cut(<span class="hljs-string">&quot;&#x4ED6;&#x6765;&#x5230;&#x4E86;&#x7F51;&#x6613;&#x676D;&#x7814;&#x5927;&#x53A6;&quot;</span>)  
print(<span class="hljs-string">&quot;&#x9ED8;&#x8BA4;&#x6A21;&#x5F0F;: &quot;</span> + <span class="hljs-string">&quot;/ &quot;</span>.join(seg_list)) <span class="hljs-comment"># &#x9ED8;&#x8BA4;&#x662F;&#x7CBE;&#x786E;&#x6A21;&#x5F0F;</span>

seg_list = jb.cut_for_search(<span class="hljs-string">&quot;&#x5C0F;&#x660E;&#x7855;&#x58EB;&#x6BD5;&#x4E1A;&#x4E8E;&#x4E2D;&#x56FD;&#x79D1;&#x5B66;&#x9662;&#x8BA1;&#x7B97;&#x6240;&#xFF0C;&#x540E;&#x5728;&#x65E5;&#x672C;&#x4EAC;&#x90FD;&#x5927;&#x5B66;&#x6DF1;&#x9020;&quot;</span>)  
print(<span class="hljs-string">&quot;&#x641C;&#x7D22;&#x5F15;&#x64CE;&#x6A21;&#x5F0F;: &quot;</span> + <span class="hljs-string">&quot;/ &quot;</span>.join(seg_list)) <span class="hljs-comment"># &#x641C;&#x7D22;&#x5F15;&#x64CE;&#x6A21;&#x5F0F;</span>
</code></pre>
<p>&#x6267;&#x884C;&#x7ED3;&#x679C;&#xFF1A;</p>
<pre><code class="lang-python">&#x5168;&#x6A21;&#x5F0F;: &#x6211;/ &#x6765;&#x5230;/ &#x5317;&#x4EAC;/ &#x6E05;&#x534E;/ &#x6E05;&#x534E;&#x5927;&#x5B66;/ &#x534E;&#x5927;/ &#x5927;&#x5B66;
&#x7CBE;&#x786E;&#x6A21;&#x5F0F;: &#x6211;/ &#x6765;&#x5230;/ &#x5317;&#x4EAC;/ &#x6E05;&#x534E;&#x5927;&#x5B66;
&#x9ED8;&#x8BA4;&#x6A21;&#x5F0F;: &#x4ED6;/ &#x6765;&#x5230;/ &#x4E86;/ &#x7F51;&#x6613;/ &#x676D;&#x7814;/ &#x5927;&#x53A6;
&#x641C;&#x7D22;&#x5F15;&#x64CE;&#x6A21;&#x5F0F;: &#x5C0F;&#x660E;/ &#x7855;&#x58EB;/ &#x6BD5;&#x4E1A;/ &#x4E8E;/ &#x4E2D;&#x56FD;/ &#x79D1;&#x5B66;/ &#x5B66;&#x9662;/ &#x79D1;&#x5B66;&#x9662;/ &#x4E2D;&#x56FD;&#x79D1;&#x5B66;&#x9662;/ &#x8BA1;&#x7B97;/ &#x8BA1;&#x7B97;&#x6240;/ &#xFF0C;/ &#x540E;/ &#x5728;/ &#x65E5;&#x672C;/ &#x4EAC;&#x90FD;/ &#x5927;&#x5B66;/ &#x65E5;&#x672C;&#x4EAC;&#x90FD;&#x5927;&#x5B66;/ &#x6DF1;&#x9020;
</code></pre>
<h2 id="jieba&#x5206;&#x8BCD;&#x7684;&#x57FA;&#x672C;&#x601D;&#x8DEF;">jieba&#x5206;&#x8BCD;&#x7684;&#x57FA;&#x672C;&#x601D;&#x8DEF;</h2>
<p>jieba&#x5206;&#x8BCD;&#x5BF9;&#x5DF2;&#x6536;&#x5F55;&#x8BCD;&#x548C;&#x672A;&#x6536;&#x5F55;&#x8BCD;&#x90FD;&#x6709;&#x76F8;&#x5E94;&#x7684;&#x7B97;&#x6CD5;&#x8FDB;&#x884C;&#x5904;&#x7406;&#xFF0C;&#x5176;&#x5904;&#x7406;&#x7684;&#x601D;&#x8DEF;&#x5F88;&#x7B80;&#x5355;&#xFF0C;&#x4E3B;&#x8981;&#x7684;&#x5904;&#x7406;&#x601D;&#x8DEF;&#x5982;&#x4E0B;&#xFF1A;</p>
<blockquote>
<ul>
<li>&#x52A0;&#x8F7D;&#x8BCD;&#x5178;dict.txt</li>
<li>&#x4ECE;&#x5185;&#x5B58;&#x7684;&#x8BCD;&#x5178;&#x4E2D;&#x6784;&#x5EFA;&#x8BE5;&#x53E5;&#x5B50;&#x7684;DAG&#xFF08;&#x6709;&#x5411;&#x65E0;&#x73AF;&#x56FE;&#xFF09;</li>
<li>&#x5BF9;&#x4E8E;&#x8BCD;&#x5178;&#x4E2D;&#x672A;&#x6536;&#x5F55;&#x8BCD;&#xFF0C;&#x4F7F;&#x7528;HMM&#x6A21;&#x578B;&#x7684;viterbi&#x7B97;&#x6CD5;&#x5C1D;&#x8BD5;&#x5206;&#x8BCD;&#x5904;&#x7406;</li>
<li>&#x5DF2;&#x6536;&#x5F55;&#x8BCD;&#x548C;&#x672A;&#x6536;&#x5F55;&#x8BCD;&#x5168;&#x90E8;&#x5206;&#x8BCD;&#x5B8C;&#x6BD5;&#x540E;&#xFF0C;&#x4F7F;&#x7528;dp&#x5BFB;&#x627E;DAG&#x7684;&#x6700;&#x5927;&#x6982;&#x7387;&#x8DEF;&#x5F84;&#xFF0C;&#x8F93;&#x51FA;&#x5206;&#x8BCD;&#x7ED3;&#x679C;</li>
</ul>
</blockquote>
<h2 id="&#x6848;&#x4F8B;&#xFF1A;">&#x6848;&#x4F8B;&#xFF1A;</h2>
<pre><code class="lang-python"><span class="hljs-comment">#!/usr/bin/env python</span>
<span class="hljs-comment"># -*- coding:utf-8 -*-</span>

<span class="hljs-keyword">import</span> jieba
<span class="hljs-keyword">import</span> requests
<span class="hljs-keyword">from</span> bs4 <span class="hljs-keyword">import</span> BeautifulSoup

<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">extract_text</span><span class="hljs-params">(url)</span>:</span>
    <span class="hljs-comment"># &#x53D1;&#x9001;url&#x8BF7;&#x6C42;&#x5E76;&#x83B7;&#x53D6;&#x54CD;&#x5E94;&#x6587;&#x4EF6;</span>
    page_source = requests.get(url).content
    bs_source = BeautifulSoup(page_source, <span class="hljs-string">&quot;lxml&quot;</span>)

    <span class="hljs-comment"># &#x89E3;&#x6790;&#x51FA;&#x6240;&#x6709;&#x7684;p&#x6807;&#x7B7E;</span>
    report_text = bs_source.find_all(<span class="hljs-string">&apos;p&apos;</span>)

    text = <span class="hljs-string">&apos;&apos;</span>
    <span class="hljs-comment"># &#x5C06;p&#x6807;&#x7B7E;&#x91CC;&#x7684;&#x6240;&#x6709;&#x5185;&#x5BB9;&#x90FD;&#x4FDD;&#x5B58;&#x5230;&#x4E00;&#x4E2A;&#x5B57;&#x7B26;&#x4E32;&#x91CC;</span>
    <span class="hljs-keyword">for</span> p <span class="hljs-keyword">in</span> report_text:
        text += p.get_text()
        text += <span class="hljs-string">&apos;\n&apos;</span>

    <span class="hljs-keyword">return</span> text

<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">word_frequency</span><span class="hljs-params">(text)</span>:</span>
    <span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> Counter
    <span class="hljs-comment"># &#x8FD4;&#x56DE;&#x6240;&#x6709;&#x5206;&#x8BCD;&#x540E;&#x957F;&#x5EA6;&#x5927;&#x4E8E;&#x7B49;&#x4E8E;2 &#x7684;&#x8BCD;&#x7684;&#x5217;&#x8868;</span>
    words = [word <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> jieba.cut(text, cut_all=<span class="hljs-keyword">True</span>) <span class="hljs-keyword">if</span> len(word) &gt;= <span class="hljs-number">2</span>]

    <span class="hljs-comment"># Counter&#x662F;&#x4E00;&#x4E2A;&#x7B80;&#x5355;&#x7684;&#x8BA1;&#x6570;&#x5668;&#xFF0C;&#x7EDF;&#x8BA1;&#x5B57;&#x7B26;&#x51FA;&#x73B0;&#x7684;&#x4E2A;&#x6570;</span>
    <span class="hljs-comment"># &#x5206;&#x8BCD;&#x540E;&#x7684;&#x5217;&#x8868;&#x5C06;&#x88AB;&#x8F6C;&#x5316;&#x4E3A;&#x5B57;&#x5178;</span>
    c = Counter(words)

    <span class="hljs-keyword">for</span> word_freq <span class="hljs-keyword">in</span> c.most_common(<span class="hljs-number">10</span>):
        word, freq = word_freq
        print(word, freq)

<span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">&quot;__main__&quot;</span>:
    url = <span class="hljs-string">&apos;http://www.gov.cn/premier/2017-03/16/content_5177940.htm&apos;</span>
    text = extract_text(url)
    word_frequency(text)
</code></pre>
<p>&#x6267;&#x884C;&#x7ED3;&#x679C;&#xFF1A;</p>
<pre><code class="lang-python">Building prefix dict <span class="hljs-keyword">from</span> the default dictionary ...
Loading model <span class="hljs-keyword">from</span> cache /var/folders/dp/wxmmld_s7k9gk_5fbhdcr2y00000gn/T/jieba.cache
Loading model cost <span class="hljs-number">0.843</span> seconds.
Prefix dict has been built succesfully.
&#x53D1;&#x5C55; <span class="hljs-number">134</span>
&#x6539;&#x9769; <span class="hljs-number">85</span>
&#x7ECF;&#x6D4E; <span class="hljs-number">71</span>
&#x63A8;&#x8FDB; <span class="hljs-number">66</span>
&#x5EFA;&#x8BBE; <span class="hljs-number">59</span>
&#x793E;&#x4F1A; <span class="hljs-number">49</span>
&#x4EBA;&#x6C11; <span class="hljs-number">47</span>
&#x4F01;&#x4E1A; <span class="hljs-number">46</span>
&#x52A0;&#x5F3A; <span class="hljs-number">46</span>
&#x653F;&#x7B56; <span class="hljs-number">46</span>
</code></pre>
<h5 id="&#x6D41;&#x7A0B;&#x4ECB;&#x7ECD;">&#x6D41;&#x7A0B;&#x4ECB;&#x7ECD;</h5>
<ol>
<li><p>&#x9996;&#x5148;&#xFF0C;&#x6211;&#x4EEC;&#x4ECE;&#x7F51;&#x4E0A;&#x6293;&#x53D6;&#x653F;&#x5E9C;&#x5DE5;&#x4F5C;&#x62A5;&#x544A;&#x7684;&#x5168;&#x6587;&#x3002;&#x6211;&#x5C06;&#x8FD9;&#x4E2A;&#x6B65;&#x9AA4;&#x5C01;&#x88C5;&#x5728;&#x4E00;&#x4E2A;&#x540D;&#x53EB;extract_text&#x7684;&#x7B80;&#x5355;&#x51FD;&#x6570;&#x4E2D;&#xFF0C;&#x63A5;&#x53D7;url&#x4F5C;&#x4E3A;&#x53C2;&#x6570;&#x3002;&#x56E0;&#x4E3A;&#x76EE;&#x6807;&#x9875;&#x9762;&#x4E2D;&#x62A5;&#x544A;&#x7684;&#x6587;&#x672C;&#x5728;&#x6240;&#x6709;&#x7684;p&#x5143;&#x7D20;&#x4E2D;&#xFF0C;&#x6240;&#x4EE5;&#x6211;&#x4EEC;&#x53EA;&#x9700;&#x8981;&#x901A;&#x8FC7;BeautifulSoup&#x9009;&#x4E2D;&#x5168;&#x90E8;&#x7684;p&#x5143;&#x7D20;&#x5373;&#x53EF;&#xFF0C;&#x6700;&#x540E;&#x8FD4;&#x56DE;&#x4E00;&#x4E2A;&#x5305;&#x542B;&#x4E86;&#x62A5;&#x544A;&#x6B63;&#x6587;&#x7684;&#x5B57;&#x7B26;&#x4E32;&#x3002;</p>
</li>
<li><p>&#x7136;&#x540E;&#xFF0C;&#x6211;&#x4EEC;&#x5C31;&#x53EF;&#x4EE5;&#x5229;&#x7528;jieba&#x8FDB;&#x884C;&#x5206;&#x8BCD;&#x4E86;&#x3002;&#x8FD9;&#x91CC;&#xFF0C;&#x6211;&#x4EEC;&#x8981;&#x9009;&#x62E9;&#x5168;&#x6A21;&#x5F0F;&#x5206;&#x8BCD;&#x3002;jieba&#x7684;&#x5168;&#x6A21;&#x5F0F;&#x5206;&#x8BCD;&#xFF0C;&#x5373;&#x628A;&#x53E5;&#x5B50;&#x4E2D;&#x6240;&#x6709;&#x7684;&#x53EF;&#x4EE5;&#x6210;&#x8BCD;&#x7684;&#x8BCD;&#x8BED;&#x90FD;&#x626B;&#x63CF;&#x51FA;&#x6765;, &#x901F;&#x5EA6;&#x975E;&#x5E38;&#x5FEB;&#xFF0C;&#x4F46;&#x662F;&#x4E0D;&#x80FD;&#x89E3;&#x51B3;&#x6B67;&#x4E49;&#x3002;&#x4E4B;&#x6240;&#x4EE5;&#x8FD9;&#x4E48;&#x505A;&#xFF0C;&#x662F;&#x56E0;&#x4E3A;&#x9ED8;&#x8BA4;&#x7684;&#x7CBE;&#x786E;&#x6A21;&#x5F0F;&#x4E0B;&#xFF0C;&#x8FD4;&#x56DE;&#x7684;&#x8BCD;&#x9891;&#x6570;&#x636E;&#x4E0D;&#x51C6;&#x786E;&#x3002;</p>
</li>
<li><p>&#x5206;&#x8BCD;&#x65F6;&#xFF0C;&#x8FD8;&#x8981;&#x6CE8;&#x610F;&#x53BB;&#x9664;&#x6807;&#x70B9;&#x7B26;&#x53F7;&#xFF0C;&#x7531;&#x4E8E;&#x6807;&#x70B9;&#x7B26;&#x53F7;&#x7684;&#x957F;&#x5EA6;&#x90FD;&#x662F;1&#xFF0C;&#x6240;&#x4EE5;&#x6211;&#x4EEC;&#x6DFB;&#x52A0;&#x4E00;&#x4E2A;len(word) &gt;= 2&#x7684;&#x6761;&#x4EF6;&#x5373;&#x53EF;&#x3002;</p>
</li>
<li><p>&#x6700;&#x540E;&#xFF0C;&#x6211;&#x4EEC;&#x5C31;&#x53EF;&#x4EE5;&#x5229;&#x7528;Counter&#x7C7B;&#xFF0C;&#x5C06;&#x5206;&#x8BCD;&#x540E;&#x7684;&#x5217;&#x8868;&#x5FEB;&#x901F;&#x5730;&#x8F6C;&#x5316;&#x4E3A;&#x5B57;&#x5178;&#xFF0C;&#x5176;&#x4E2D;&#x7684;&#x952E;&#x503C;&#x5C31;&#x662F;&#x952E;&#x7684;&#x51FA;&#x73B0;&#x6B21;&#x6570;&#xFF0C;&#x4E5F;&#x5C31;&#x662F;&#x8FD9;&#x4E2A;&#x8BCD;&#x5728;&#x5168;&#x6587;&#x4E2D;&#x51FA;&#x73B0;&#x7684;&#x6B21;&#x6570;&#x3002; </p>
</li>
</ol>
<footer class="page-footer"><span class="copyright">Copyright &#xA9; BigCat all rights reserved&#xFF0C;powered by GitBook</span><span class="footer-modification">&#x300C;Revision Time:
2017-04-27 00:22:49&#x300D;
</span></footer>
                    
                    </section>
                
                
                </div>
            </div>
        </div>

        
        <a href="../../file/part06/6.1.html" class="navigation navigation-prev " aria-label="Previous page: NLTK与自然语言处理基础"><i class="fa fa-angle-left"></i></a>
        
        
        <a href="../../file/part06/6.3.html" class="navigation navigation-next " aria-label="Next page: 情感分析"><i class="fa fa-angle-right"></i></a>
        
    </div>
</div>

        
<script src="../../gitbook/app.js"></script>

    
    <script src="../../gitbook/plugins/gitbook-plugin-splitter/splitter.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-fontsettings/buttons.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-livereload/plugin.js"></script>
    

<script>
// Load the "gitbook" module and, once it is available, start the GitBook
// client with the configuration object below.
require(["gitbook"], function(gitbook) {
    // Settings object passed unchanged to gitbook.start(). Its top-level keys
    // (e.g. "splitter", "toggle-chapters", "fontsettings") match the plugin
    // assets linked elsewhere on this page.
    // NOTE(review): this literal looks build-generated — presumably it should
    // be changed in the book's configuration rather than edited here; confirm.
    var config = {"disqus":{"shortName":"gitbookuse"},"github":{"url":"https://github.com/dododream"},"search-pro":{"cutWordLib":"nodejieba","defineWord":["gitbook-use"]},"sharing":{"weibo":true,"facebook":true,"twitter":true,"google":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"tbfed-pagefooter":{"copyright":"Copyright © BigCat","modify_label":"「Revision Time:","modify_format":"YYYY-MM-DD HH:mm:ss」"},"baidu":{"token":"ff100361cdce95dd4c8fb96b4009f7bc"},"sitemap":{"hostname":"http://www.treenewbee.top"},"donate":{"wechat":"http://weixin.png","alipay":"http://alipay.png","title":"","button":"赏","alipayText":"支付宝打赏","wechatText":"微信打赏"},"edit-link":{"base":"https://github.com/dododream/edit","label":"Edit This Page"},"splitter":{},"toggle-chapters":{},"highlight":{},"fontsettings":{"theme":"white","family":"sans","size":2},"livereload":{}};
    gitbook.start(config);
});
</script>

        <!-- body:end -->
    </body>
    <!-- End of book Python数据分析课程讲义 -->
</html>
